# Reproducibility: fix the RNG state before the random train/test split
set.seed(100)

# 80/20 train/test split, stratified on the outcome so both partitions
# keep roughly the same sentiment class proportions
splitted <- initial_split(data = data_clean, prop = 0.8, strata = "sentiment")

# sanity check: <training/testing/total> row counts
splitted
#> <11681/2919/14600>
# Preprocessing recipe, defined and estimated (prep) on the training
# partition only -- prevents information from the test set leaking into
# the learned parameters. Step order matters: text cleaning must happen
# before tokenization.
rec <- recipe(sentiment ~ ., data = training(splitted)) %>%
# drop every column except the outcome and the raw text
step_rm(-sentiment, -tweet) %>%
# encode the outcome as a factor with an explicit level order
step_string2factor(sentiment, levels = c("negative", "neutral", "positive")) %>%
# balance the classes by downsampling to a 1:1 ratio (training data only)
step_downsample(sentiment, ratio = 1/1, seed = 100) %>%
# --- text normalisation via textclean helpers, applied in sequence ---
step_mutate(tweet = replace_kern(tweet)) %>%
step_mutate(tweet = replace_word_elongation(tweet)) %>%
# mask dates/times with sentinel tokens instead of dropping them
step_mutate(tweet = replace_date(tweet, replacement = "<<DATE>>")) %>%
step_mutate(tweet = replace_time(tweet, replacement = "<<TIME>>")) %>%
step_mutate(tweet = replace_ordinal(tweet)) %>%
step_mutate(tweet = replace_money(tweet)) %>%
step_mutate(tweet = replace_number(tweet)) %>%
step_mutate(tweet = replace_internet_slang(tweet)) %>%
step_mutate(tweet = replace_emoji(tweet)) %>%
step_mutate(tweet = replace_contraction(tweet)) %>%
step_mutate(tweet = replace_symbol(tweet)) %>%
# tokenize, stem, drop stopwords, keep the 64 most frequent tokens,
# then turn them into binary (present/absent) term-frequency columns.
# NOTE(review): token = "word_stems" already stems during tokenization,
# so the following step_stem() looks redundant -- confirm it is intended.
step_tokenize(tweet, token = "word_stems") %>%
step_stem(tweet) %>%
step_stopwords(tweet) %>%
step_tokenfilter(tweet, max_tokens = 64) %>%
step_tf(tweet, weight_scheme = "binary") %>%
prep()
# quick check: first 10 rows of the preprocessed training data
head(juice(rec), 10)
# define model specification
# Random forest specification: 500 trees, 2 candidate predictors per
# split, minimum node size of 1, fitted with the ranger engine.
model_spec <- rand_forest(mode = "classification", mtry = 2, trees = 500, min_n = 1) %>%
  set_engine(
    engine = "ranger",
    seed = 100,                            # reproducible fit
    num.threads = parallel::detectCores(), # use every available core
    importance = "impurity"                # record impurity-based importance
  )
# quick check: print the spec to confirm main + engine-specific arguments
model_spec
#> Random Forest Model Specification (classification)
#>
#> Main Arguments:
#> mtry = 2
#> trees = 500
#> min_n = 1
#>
#> Engine-Specific Arguments:
#> seed = 100
#> num.threads = parallel::detectCores()
#> importance = impurity
#>
#> Computational engine: ranger
# Fit with the x/y interface: juice() returns the already-preprocessed
# training data; predictors are every column except the outcome.
model <- model_spec %>%
  fit_xy(
    x = juice(rec, -sentiment),
    y = juice(rec, sentiment)
  )

# quick check: inspect the underlying ranger fit
model
#> parsnip model object
#>
#> Ranger result
#>
#> Call:
#> ranger::ranger(formula = formula, data = data, mtry = ~2, num.trees = ~500, min.node.size = ~1, seed = ~100, num.threads = ~parallel::detectCores(), importance = ~"impurity", verbose = FALSE, probability = TRUE)
#>
#> Type: Probability estimation
#> Number of trees: 500
#> Sample size: 5661
#> Number of independent variables: 64
#> Mtry: 2
#> Target node size: 1
#> Variable importance mode: impurity
#> Splitrule: gini
#> OOB prediction error (Brier s.): 0.3411754
# Variable importance: ranger stores impurity importance as a named
# numeric vector (predictor name -> score).
# FIX: broom::tidy() on a bare numeric vector was deprecated and later
# removed, so build the two-column tibble explicitly with enframe().
var_imp <- tibble::enframe(
  model$fit$variable.importance,
  name = "variable",
  value = "importance"
)

# FIX: keep the 10 *largest* scores. The raw importance vector is in
# predictor-column order, not sorted, so the original head(10) returned
# the first 10 columns rather than the "Top 10" the plot title claims.
var_imp <- var_imp %>%
  arrange(desc(importance)) %>%
  head(10) %>%
  mutate(variable = reorder(variable, importance)) # order bars by score

# variable importance plot: horizontal bars, largest on top
ggplot(var_imp, aes(x = variable, y = importance)) +
  geom_col(fill = "darkblue") +
  coord_flip() +
  labs(title = "Variables Importance (Top 10)", x = NULL, y = NULL, fill = NULL) +
  # FIX: expansion() replaces the deprecated expand_scale(); bars start
  # flush at zero with 10% headroom on the upper end
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  theme_minimal()
# predict on test
# Apply the trained recipe to the held-out test rows, then attach both
# the hard class predictions and the per-class probabilities.
pred_test <- bake(rec, testing(splitted)) %>%
# NOTE: with magrittr, `.` inside bind_cols() is the pipe's left-hand
# value at that stage -- the baked predictors on the first call, and the
# same data plus .pred_class on the second. Presumably parsnip's
# predict() selects the predictor columns it needs by name and ignores
# the extra .pred_class column -- TODO confirm.
bind_cols(predict(model, .)) %>%
bind_cols(predict(model, ., type = "prob")) %>%
# keep only the truth column and the prediction columns
select(sentiment, starts_with(".pred"))
# quick check
head(pred_test, 10)
# metrics summary
# Test-set performance summary using yardstick's vector helpers.
# NOTE(review): with a three-level outcome these presumably default to
# macro averaging -- confirm that is the intended multiclass estimator.
pred_test %>%
  summarise(
    accuracy    = accuracy_vec(truth = sentiment, estimate = .pred_class),
    sensitivity = sens_vec(truth = sentiment, estimate = .pred_class),
    specificity = spec_vec(truth = sentiment, estimate = .pred_class),
    precision   = precision_vec(truth = sentiment, estimate = .pred_class)
  )
# get roc curve data on test dataset
# ROC curve data on the test set: one-vs-all curve per class level
# (.level column), computed from the three probability columns.
pred_test_roc <- pred_test %>%
  roc_curve(sentiment, .pred_negative:.pred_positive)

# Tidying: round numeric columns, then reshape to long format so both
# metrics share one colour scale.
# FIX: mutate_if() and gather() are superseded -- use
# mutate(across(where(...))) and pivot_longer() instead (same plotted
# data; only row order may differ, which ggplot does not care about).
pred_test_roc <- pred_test_roc %>%
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
  pivot_longer(
    cols = -c(.threshold, .level),
    names_to = "metric",
    values_to = "value"
  )

# plot sensitivity-specificity trade-off, one facet per class level
p <- ggplot(pred_test_roc, aes(x = .threshold, y = value)) +
  geom_line(aes(colour = metric)) +
  facet_wrap(~ .level, ncol = 1, scales = "free") +
  labs(x = "Probability Threshold to be Classified as Positive", y = "Value", colour = "Metrics") +
  theme_minimal()

# interactive version
ggplotly(p)
# get pr curve data on test dataset
# Precision-recall curve data on the test set: one-vs-all curve per
# class level, computed from the three probability columns.
pred_test_pr <- pred_test %>%
  pr_curve(sentiment, .pred_negative:.pred_positive)

# Tidying: round numeric columns, then reshape to long format.
# FIX: mutate_if() and gather() are superseded -- use
# mutate(across(where(...))) and pivot_longer() instead (same plotted
# data; only row order may differ, which ggplot does not care about).
pred_test_pr <- pred_test_pr %>%
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
  pivot_longer(
    cols = -c(.threshold, .level),
    names_to = "metric",
    values_to = "value"
  )

# plot recall-precision trade-off, one facet per class level
p <- ggplot(pred_test_pr, aes(x = .threshold, y = value)) +
  geom_line(aes(colour = metric)) +
  facet_wrap(~ .level, ncol = 1, scales = "free") +
  labs(x = "Probability Threshold to be Classified as Positive", y = "Value", colour = "Metrics") +
  theme_minimal()

# interactive version
ggplotly(p)